import pandas as pd
import numpy as np
import seaborn as sb
import matplotlib.pyplot as mp
from sklearn.model_selection import train_test_split, KFold, cross_val_predict, cross_val_score
from scipy.stats import zscore
from sklearn import svm
from sklearn import metrics
from sklearn.decomposition import PCA
# Load the vehicle silhouette dataset from the current working directory.
dataSet = pd.read_csv('vehicle.csv')
print("The 5 point summary of the dataSet is: ")
# NOTE: the bare expressions below only render output in a notebook cell;
# in a plain script they are evaluated and discarded.
dataSet.describe()
dataSet.head(10)
dataSet.info()
# Per-column count of missing values.
dataSet.isna().sum()
Hence, we can see there are null values in the following attributes/columns: circularity, distance_circularity, radius_ratio, scatter_ratio, elongatedness, pr.axis_rectangularity, scaled_variance, scaled_variance.1, scaled_radius_of_gyration, scaled_radius_of_gyration.1, skewness_about, skewness_about.1, skewness_about.2.
print ("The total number of the null values are:")
# Grand total of missing cells across the whole DataFrame.
dataSet.isna().sum().sum()
Treating these null values by replacing each of them with the mean of its column.
# Impute missing values: replace the NaNs in each affected column with the
# (truncated) column mean.  Plain assignment is used instead of
# `fillna(..., inplace=True)` on a column selection, which relies on
# chained assignment and is deprecated in recent pandas versions.
for n in dataSet.columns:
    if dataSet[n].isna().sum() > 0:
        # int() truncation keeps the fill value consistent with the
        # integer-valued silhouette features (matches the original intent).
        dataSet[n] = dataSet[n].fillna(int(dataSet[n].mean()))
# Re-check: every per-column count should now be zero.
dataSet.isna().sum()
Now we can see there are no null values in the dataFrame.
#Getting the data type and other info of the data frame:
dataSet.info()
#Converting the class attribute to categorical type as it defines what is the category of the vehicle:
dataSet['class'] = pd.Categorical(dataSet['class'])
# Display the first rows to confirm the conversion (notebook cell).
dataSet.head(10)
This is how our dataSet now looks- Without any null values (replaced all the nulls with the column mean) and class attribute converted to categorical type.
Now, we will be scaling the whole dataSet so as to make the values uniform.
# Standardise every numeric feature to zero mean / unit variance; the
# target column 'class' is excluded from scaling and re-attached after.
dataSetScaled = dataSet.drop('class', axis = 1)
dataSetScaled = dataSetScaled.apply(zscore)
#The scaled dataSet is as followed:
dataSetScaled['class'] = dataSet['class']
dataSetScaled.head(10)
Now that the data is scaled, converting the categorical column 'class' to a numeric one by assigning each class of vehicle a number as follows:
# Inspect the raw categorical values before encoding (notebook cell).
dataSetScaled['class'].values
#There are 3 classes bus, car and van which will be converted as bus = 0, car = 1 and van = 2
replace_map = {'class':{'bus':0, 'car':1, 'van':2}}
dataSetScaled = dataSetScaled.replace(replace_map)
Now that the data is ready, computing the correlation matrix of the dataSet to find the attributes that correlate most strongly with the target column 'class'.
# Pairwise Pearson correlation between all (now fully numeric) columns.
dataSetScaled.corr()
#Plotting a heat map of the above correlation:
def plot_corr(data, size = 20):
    """Render the correlation matrix of `data` as a colour-mapped grid.

    data : DataFrame whose pairwise correlations are plotted.
    size : edge length (in inches) of the square figure.
    """
    corr = data.corr()
    fig, ax = mp.subplots(figsize = (size, size))
    cax = ax.matshow(corr)
    # A colorbar is needed to read the colours as correlation values.
    fig.colorbar(cax)
    # Rotate the x labels so that 18 column names do not overlap.
    mp.xticks(range(len(corr.columns)), corr.columns, rotation = 90)
    mp.yticks(range(len(corr.columns)), corr.columns)
plot_corr(dataSetScaled)
Thus, for modeling the support vector machine, referring to the above correlation, the following attributes will be eliminated: compactness, max.length_rectangularity, skewness_about.1.
def create_confusionMat(test = None, pred = None, labels = (1, 0)):
    """Plot a labelled confusion-matrix heatmap for true vs. predicted labels.

    test   : true class labels.
    pred   : predicted class labels.
    labels : class values (row/column order) to include in the matrix.
             Defaults to (1, 0) to preserve the original binary layout;
             pass [0, 1, 2] to cover all three vehicle classes
             (bus = 0, car = 1, van = 2).
    """
    # None sentinels replace the original mutable [] defaults — a shared
    # list default is a classic Python pitfall.
    test = [] if test is None else test
    pred = [] if pred is None else pred
    labels = list(labels)
    confMatrix = metrics.confusion_matrix(test, pred, labels = labels)
    confM_data = pd.DataFrame(confMatrix,
                              index = [str(l) for l in labels],
                              columns = ['Predict ' + str(l) for l in labels])
    sb.heatmap(confM_data, annot = True)
def cal_accuracy(test = None, pred = None):
    """Print the accuracy of predictions `pred` against the truth `test`."""
    # None sentinels replace the original mutable [] defaults.
    test = [] if test is None else test
    pred = [] if pred is None else pred
    print ("Accuracy of the model is: " + str(metrics.accuracy_score(test, pred)))
def classification_report_of_model(test = None, pred = None, labels = (1, 0)):
    """Print per-class precision/recall/F1 for the given predictions.

    labels defaults to (1, 0) to preserve the original binary report;
    pass [0, 1, 2] to report on all three vehicle classes.
    """
    # None sentinels replace the original mutable [] defaults.
    test = [] if test is None else test
    pred = [] if pred is None else pred
    # Fixed the misspelled "classificaiton" in the printed message.
    print ("The classification report of the model is: ")
    print (metrics.classification_report(test, pred, labels = list(labels)))
# Features: drop the target plus the three attributes eliminated above
# for their weak correlation with 'class'.
X_Values = dataSetScaled.drop(['class', 'max.length_rectangularity', 'compactness', 'skewness_about.1'], axis = 1)
Y_Values = dataSetScaled['class']
# 80/20 train/test split; fixed random_state for reproducibility.
x_train, x_test, y_train, y_test = train_test_split(X_Values, Y_Values, test_size = 0.2 ,random_state = 1)
Building a Support Vector machine:
# RBF-kernel SVM with small gamma and moderate regularisation (C = 3).
svm_model = svm.SVC(gamma = 0.025, C = 3)
svm_model.fit(x_train, y_train)
# Mean accuracy on the held-out test set (displayed in a notebook cell).
svm_model.score(x_test,y_test)
pred = svm_model.predict(x_test)
#Following is the accuracy of the support vector machine:
cal_accuracy(y_test, pred)
create_confusionMat(y_test, pred)
From the above confusion matrix we can infer that the SVM model: The total number of true positives are 86. This means, for 86 times the model is able to correctly predict the class of the vehicle. The total number of false positives are 0, that is the model didn't wrongly predict the class of the vehicle.
The total number of false negatives are 1. This means that only once the model predicts the class of the vehicle to be something else but is something different in actual.
The total number of true negatives is 35. This means that 35 times the model correctly predicted that a vehicle did not belong to the class under consideration.
Now, let's perform a K-Fold Cross Validation of the above Support Vector Machine model.
#Checking out scores of every K-fold cross validation that is K = 5 in our case:
scores = cross_val_score(svm_model, X_Values, Y_Values, cv = 5)
print("The Scores are: ", scores)
Thus, from the above scores of K-fold cross validation we're getting a maximum score of 97.05 when K = 5 for a support vector machine.
=======================================================================================================================
Now, using PCA that is principal component analysis to generate the Principal Components. This will be then used to model the SVM and later compared with the model one above.
#For PCA we don't need the target column and hence the dataSet without target column 'class' will be used:
X_dataSet_PCA = dataSetScaled.drop('class', axis = 1)
# Pairwise scatter plots with KDE diagonals (slow for 18 features).
sb.pairplot(X_dataSet_PCA, diag_kind='kde')
#creating the covariance matrix of the above dataSet
covMat = np.cov(X_dataSet_PCA, rowvar= False)
print ("The covariance matrix is: ", covMat)
Finding the number of n_components that will comprise 95% of the variance:
# Fit PCA on all components to inspect the explained-variance spectrum.
pca = PCA()
pca.fit(X_dataSet_PCA)
print("The eigen values are: \n", pca.explained_variance_)
print("The eigen vectors are: \n", pca.components_)
# Derive the component count from the fitted model instead of
# hard-coding 18, so the plots stay correct if the feature set changes.
n_components = len(pca.explained_variance_ratio_)
# Scree plot: variance explained by each individual component.
mp.bar(list(range(0, n_components)), pca.explained_variance_ratio_, alpha=0.5, align='center' )
mp.ylabel('Variation explained')
mp.xlabel('eigen Value')
mp.show()
# Cumulative variance curve - used to pick the 95% cut-off below.
mp.step(list(range(0, n_components)), np.cumsum(pca.explained_variance_ratio_))
mp.ylabel('Cum of variation explained')
mp.xlabel('eigen Value')
mp.show()
Thus, from the above plots, we can select the first 9 components, as they account for over 95% of the variance of the dataSet.
Now, building an SVM model with 9 principal components.
# Re-fit PCA keeping only the first 9 components (>= 95% of the variance,
# per the cumulative plot above).
pca_9 = PCA(n_components=9)
pca_9.fit(X_dataSet_PCA)
print ("Following components will be conisdered: \n",pca_9.components_)
print ("The total ratio of these principal components is: \n", pca_9.explained_variance_ratio_)
#Transforming the dataSet with these pca_9 principal components:
dataSet_pca9_transformed = pca_9.transform(X_dataSet_PCA)
# Pairwise scatter plots of the 9 projected components.
sb.pairplot(pd.DataFrame(dataSet_pca9_transformed))
Fitting the SVM model with the above pca 9 dataSet
# Wrap the projected numpy array in a DataFrame and re-attach the target.
dataSet_pca9_transformed = pd.DataFrame(dataSet_pca9_transformed)
dataSet_pca9_transformed.head(10)
dataSet_pca9_transformed['class'] = dataSetScaled['class']
dataSet_pca9_transformed.head(10)
Now, splitting the above dataSet_pca9_transformed using the train_test_split
X_Vals = dataSet_pca9_transformed.drop('class', axis =1)
Y_Vals = dataSet_pca9_transformed['class']
# Same split parameters as before so the two models are comparable.
x_train_pca, x_test_pca, y_train_pca, y_test_pca = train_test_split(X_Vals, Y_Vals, test_size = 0.2, random_state = 1)
# Identical hyper-parameters to the non-PCA SVM for a fair comparison.
svm_model_pca = svm.SVC(gamma = 0.025, C = 3)
svm_model_pca.fit(x_train_pca,y_train_pca)
svm_model_pca.score(x_test_pca, y_test_pca)
pred = svm_model_pca.predict(x_test_pca)
# NOTE(review): duplicate of the score call two lines above.
svm_model_pca.score(x_test_pca, y_test_pca)
cal_accuracy(y_test_pca, pred)
create_confusionMat(y_test_pca, pred)
From the above confusion matrix we can infer that for this SVM model the total number of true positives is 87. This means that 87 times the model correctly predicted the class of the vehicle. The total number of false positives is 0, that is, the model never wrongly predicted a vehicle to belong to the class.
The total number of false negatives are 1. This means that only once the model predicts the class of the vehicle to be something else but is something different in actual.
The total number of true negatives is 36. This means that 36 times the model correctly predicted that a vehicle did not belong to the class under consideration.
Now finding the K-Fold cross validation of this SVM model based on the PCA dataSet:
#Checking out scores of every K-fold cross validation that is K = 5 in our case:
scores_pca = cross_val_score(svm_model_pca, X_Vals, Y_Vals, cv = 5)
print("The Scores are: ", scores_pca)
Thus, from the above scores of K-fold cross validation we're getting a maximum score of 96.47 when K = 5 for a support vector machine.
=======================================================================================================================
Thus, following inferences can be made from the above activity:
Accuracy for SVM with original data without PCA = 95.29
Accuracy for SVM with PCA9 data = 96.47
Thus, we gain accuracy of almost 1% by using principal components on the same support vector machine model.
Also, the K-Folds CV score of the SVM with original data is [0.94117647 0.96470588 0.97058824 0.95266272 0.96407186] and for the SVM with PCA data is [0.94705882 0.96470588 0.95882353 0.94674556 0.96407186] where the maximum score of the SVM with original data is higher.